Applying our zero-inflated model to the data from Zeisel et al. (2015). The data are from the mouse cortex and hippocampus. We have information about the tissue of origin and about the clusters found by the authors.
I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.
data <- read.table("expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')
tissue <- as.factor(as.matrix(data)[1,-(1:2)])
table(tissue)
## tissue
## ca1hippocampus sscortex
## 1314 1691
group <- as.factor(as.matrix(data)[2,-(1:2)])
table(tissue, group)
## group
## tissue 1 2 3 4 5 6 7 8 9
## ca1hippocampus 126 20 919 121 14 18 64 17 15
## sscortex 164 370 29 699 84 157 134 9 45
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
table(tissue, sex)
## sex
## tissue -1 0 1
## ca1hippocampus 748 46 520
## sscortex 776 0 915
level1 <- as.factor(as.matrix(data)[9,-(1:2)])
level2 <- as.factor(as.matrix(data)[10,-(1:2)])
table(level1)
## level1
## astrocytes_ependymal endothelial-mural interneurons
## 224 235 290
## microglia oligodendrocytes pyramidal CA1
## 98 820 939
## pyramidal SS
## 399
table(level1, level2)
## level2
## level1 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
## astrocytes_ependymal 65 68 61 0 0 0
## endothelial-mural 15 0 0 0 0 0
## interneurons 0 0 0 0 0 0
## microglia 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 380 447 49
## pyramidal SS 109 0 0 0 0 0
## level2
## level1 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
## astrocytes_ependymal 0 10 0 20 0 0 0
## endothelial-mural 0 0 0 0 0 0 0
## interneurons 0 0 0 0 12 21 10
## microglia 0 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0 0
## pyramidal CA1 41 0 0 0 0 0 0
## pyramidal SS 0 0 5 0 0 0 0
## level2
## level1 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0 0 0 0
## interneurons 21 15 22 18 20 24 10 15 20
## microglia 0 0 0 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 0 0 0
## level2
## level1 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0 0 0 0
## interneurons 22 23 26 11 0 0 0 0 0
## microglia 0 0 0 0 17 16 0 0 0
## oligodendrocytes 0 0 0 0 0 0 45 98 87
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 0 0 0
## level2
## level1 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
## astrocytes_ependymal 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 21 0 0 0
## interneurons 0 0 0 0 0 0 0
## microglia 0 0 0 0 32 33 0
## oligodendrocytes 106 125 359 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 81
## level2
## level1 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
## astrocytes_ependymal 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0
## interneurons 0 0 0 0 0 0
## microglia 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0
## pyramidal SS 74 26 16 28 39 21
## level2
## level1 SubPyr Vend1 Vend2 Vsmc
## astrocytes_ependymal 0 0 0 0
## endothelial-mural 0 32 105 62
## interneurons 0 0 0 0
## microglia 0 0 0 0
## oligodendrocytes 0 0 0 0
## pyramidal CA1 22 0 0 0
## pyramidal SS 0 0 0 0
counts <- as.matrix(data[12:NROW(data),-(1:2)])
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NROW(data),1]
colnames(counts) <- data[8, -(1:2)]
To speed up the computations, I will focus on the top 1,000 most variable genes.
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(2, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")
colTissue <- col1[tissue]
colMerged <- col2[level1]
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=20, main="PCA of log-counts, centered not scaled")
legend("topleft", levels(level1), fill=col2)
plot(pca$x, col=colTissue, pch=20, main="PCA of log-counts, centered not scaled")
fq <- EDASeq::betweenLaneNormalization(counts, which="full")
pcafq <- prcomp(t(log1p(fq[rownames(core),])))
plot(pcafq$x, col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")
plot(pcafq$x, col=colTissue, pch=20, main="PCA of FQ log-counts, centered not scaled")
The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.
They found that to fully explain the differences between clusters, we need three dimensions.
pairs(pcafq$x[,1:3], col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")
df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=20, col=colMerged)
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.0000000 0.1934101 -0.8208739 -0.6525198
## PC2 0.1934101 1.0000000 0.2749546 0.3998289
## detection_rate -0.8208739 0.2749546 1.0000000 0.8967131
## coverage -0.6525198 0.3998289 0.8967131 1.0000000
Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.
print(system.time(zinb_Vall <- zinbFit(core, ncores = ncores, K = 3)))
## user system elapsed
## 2485.394 537.528 421.206
pairs(zinb_Vall@W, col=colMerged, pch=20, main="ZINB")
pairs(zinb_Vall@W, col=colTissue, pch=20, main="ZINB")
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=20, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.000000000 0.006476424 0.01376237 0.63967883
## W2 0.006476424 1.000000000 0.06323623 -0.06000655
## W3 0.013762371 0.063236226 1.00000000 -0.09798784
## gamma_mu 0.639678834 -0.060006550 -0.09798784 1.00000000
## gamma_pi -0.709274239 0.131792434 0.04380961 -0.79807959
## detection_rate 0.770531522 -0.161658393 -0.07725217 0.90629270
## coverage 0.584275001 -0.151893464 -0.10906387 0.97166090
## gamma_pi detection_rate coverage
## W1 -0.70927424 0.77053152 0.5842750
## W2 0.13179243 -0.16165839 -0.1518935
## W3 0.04380961 -0.07725217 -0.1090639
## gamma_mu -0.79807959 0.90629270 0.9716609
## gamma_pi 1.00000000 -0.94897403 -0.8232345
## detection_rate -0.94897403 1.00000000 0.8967131
## coverage -0.82323454 0.89671312 1.0000000
print(system.time(zinb_4 <- zinbFit(core, ncores = ncores, K = 4)))
## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations
## user system elapsed
## 5497.948 850.268 792.701
pairs(zinb_4@W, col=colMerged, pch=20, main="ZINB by cluster (level 1)")
pairs(zinb_4@W, col=colTissue, pch=20, main="ZINB by tissue")
pairs(zinb_4@W, col=col2[group], pch=20, main="ZINB by group")
col3 <- c(rgb(0, 0, 0, alpha = 0), read.table("colors.txt", stringsAsFactors = FALSE, comment.char = "%")[,1])
plot(zinb_Vall@W, col=col3[level2], pch=20, main="ZINB")
plot(pcafq$x, col=col3[level2], pch=20, main="PCA (FQ)")
i <- 7
idx <- which(level1==levels(level1)[i])
print(system.time(zinb_sub <- zinbFit(core[,idx], ncores = ncores, K = 2)))
## user system elapsed
## 231.858 113.440 44.999
plot(zinb_sub@W, xlab="W1", ylab="W2", pch=19, col=col3[level2[idx]])
legend("bottomright", levels(droplevels(level2[idx])), fill=unique(col3[level2[idx]]))
i <- 6
idx <- which(level1==levels(level1)[i])
print(system.time(zinb_sub <- zinbFit(core[,idx], ncores = ncores, K = 2)))
## user system elapsed
## 662.902 259.453 132.388
plot(zinb_sub@W, xlab="W1", ylab="W2", pch=19, col=col3[level2[idx]])
legend("bottomright", levels(droplevels(level2[idx])), fill=unique(col3[level2[idx]]))